# load required packages

required_packages <- c(
  "devtools", "dplyr", "estimatr", "readxl", "ggplot2", "gridExtra", "modelsummary", 
  "lubridate", "stringr", "readr", "sensemakr", "tidyr", "patchwork", 
  "ggplotify", "rdrobust", "xtable", "skimr", "knitr"
)

install_if_missing <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Apply the install function to each package
invisible(lapply(required_packages, install_if_missing))

# Load all packages
lapply(required_packages, library, character.only = TRUE)

# install local pwtest package

devtools::install_local("pwtest-package")

# load pwtest package

library(pwtest)

# functions

ate_lm_robust <- function(outcome, treatment) {
  
  # Create a data frame to hold the non-missing values
  data <- data.frame(outcome = outcome, treatment = treatment)
  
  # Run the linear regression using lm_robust
  model <- lm_robust(outcome ~ treatment, data = data)
  
  # Extract necessary statistics
  coef_treatment <- model$coefficients['treatment']    # Coefficient for treatment
  se_treatment <- model$std.error['treatment']         # Standard error for treatment coefficient
  p_value <- 2 * pt(-abs(coef_treatment / se_treatment), df = model$df.residual)  # Two-tailed p-value
  
  # Total number of non-missing observations in the outcome variable
  total_n <- sum(!is.na(data$outcome) & !is.na(data$treatment))
  
  # Calculate means and standard errors for each treatment group
  mean_control <- mean(outcome[treatment == 0], na.rm = TRUE)  # Mean when treatment == 0
  mean_treated <- mean(outcome[treatment == 1], na.rm = TRUE)  # Mean when treatment == 1
  
  se_control <- sd(outcome[treatment == 0], na.rm = TRUE) / sqrt(sum(treatment == 0, na.rm = TRUE))  # SE for control mean
  se_treated <- sd(outcome[treatment == 1], na.rm = TRUE) / sqrt(sum(treatment == 1, na.rm = TRUE))  # SE for treated mean
  
  # Generate the relevant output
  return(c(mean_treated = mean_treated,
           se_treated = se_treated,
           mean_control = mean_control,
           se_control = se_control,
           coef_treatment = coef_treatment,
           se_treatment = se_treatment,
           p_value = p_value,
           total_n = total_n))
}

run_ate_for_multiple_outcomes <- function(data, outcome_vars, treatment) {
  # Use sapply to apply the ate_lm_robust function to each outcome variable
  results <- sapply(outcome_vars, function(outcome_var) {
    # Call the ate_lm_robust function with the current outcome variable
    ate_lm_robust(data[[outcome_var]], data[[treatment]])
  })
  
  # Transpose the results to convert into a data frame
  results_df <- as.data.frame(t(results))
  
  # Add a column for the outcome variable names
  results_df$Outcome <- outcome_vars
  
  return(results_df)
}

run_regressions <- function(vars, data_nonclustered, data_clustered, admin_data, clustered_vars = NULL, admin_vars = NULL) {
  results <- data.frame(
    dependent_var = character(),
    direct_mean = numeric(),
    direct_mean_se = numeric(),
    indirect_mean = numeric(),
    indirect_mean_se = numeric(),
    ATE = numeric(),
    ATE_se = numeric(),
    CI_upper_95 = numeric(),
    CI_lower_95 = numeric(),
    p_value = numeric(),
    N = integer(),
    stringsAsFactors = FALSE
  )
  
  # Combine all variables
  all_vars <- unique(c(vars, clustered_vars, admin_vars))
  
  for (var in all_vars) {
    # Select the appropriate data based on the variable category
    if (var %in% clustered_vars) {
      data <- data_clustered
      model <- lm_robust(as.formula(paste(var, "~ direct")), data = data, clusters = data$r_villageid)
    } else if (var %in% admin_vars) {
      data <- admin_data
      model <- lm_robust(as.formula(paste(var, "~ direct")), data = data)
    } else {
      data <- data_nonclustered
      model <- lm_robust(as.formula(paste(var, "~ direct")), data = data)
    }
    
    # Calculate means and standard errors for the "direct" and "indirect" groups
    direct_data <- data %>% filter(direct == 1) %>% pull(!!sym(var))
    indirect_data <- data %>% filter(direct == 0) %>% pull(!!sym(var))
    
    direct_mean <- mean(direct_data, na.rm = TRUE)
    direct_mean_se <- sd(direct_data, na.rm = TRUE) / sqrt(length(direct_data[!is.na(direct_data)]))
    indirect_mean <- mean(indirect_data, na.rm = TRUE)
    indirect_mean_se <- sd(indirect_data, na.rm = TRUE) / sqrt(length(indirect_data[!is.na(indirect_data)]))
    
    # Extract regression results
    ATE <- coef(model)["direct"]
    ATE_se <- model$std.error["direct"]
    CI_upper_95 <- model$conf.high["direct"]
    CI_lower_95 <- model$conf.low["direct"]
    p_value <- model$p.value["direct"]
    N <- model$nobs
    
    # Append results to the dataframe
    results <- results %>%
      add_row(
        dependent_var = var,
        direct_mean = direct_mean,
        direct_mean_se = direct_mean_se,
        indirect_mean = indirect_mean,
        indirect_mean_se = indirect_mean_se,
        ATE = ATE,
        ATE_se = ATE_se,
        CI_upper_95 = CI_upper_95,
        CI_lower_95 = CI_lower_95,
        p_value = p_value,
        N = N
      )
  }
  
  return(results)
}

# experimental results function

run_regressions_surveyexperiment <- function(data, outcome, independent_vars) {
  # Define the controls as a formula string
  controls <- "rural + age + gender + religion + caste + education + prior_vote + knowledge_local_politics + gender_norms"
  
  results <- lapply(independent_vars, function(independent_var) {
    # Create the formula with the outcome, independent variable, and controls
    formula <- as.formula(paste(outcome, "~", independent_var, "+", controls))
    
    # Run the regression model
    model <- lm_robust(formula, data = data)
    
    # Calculate means for the outcome when independent_var == 1 and independent_var == 0
    mean_indep1 <- mean(data[[outcome]][data[[independent_var]] == 1], na.rm = TRUE)
    mean_indep0 <- mean(data[[outcome]][data[[independent_var]] == 0], na.rm = TRUE)
    
    # Extract estimates and statistics
    ate <- coef(model)[independent_var]
    ci_lower <- confint(model)[independent_var, "2.5 %"]
    ci_upper <- confint(model)[independent_var, "97.5 %"]
    p_value <- summary(model)$coefficients[independent_var, "Pr(>|t|)"]
    n_obs <- model$nobs
    
    # Return as a data frame
    data.frame(
      Independent_Var = independent_var,
      Mean_Indep_1 = mean_indep1,
      Mean_Indep_0 = mean_indep0,
      ATE = ate,
      CI_Lower = ci_lower,
      CI_Upper = ci_upper,
      P_Value = p_value,
      N = n_obs
    )
  })
  
  # Bind results into a single data frame
  do.call(rbind, results)
}


# f stat and f stat p value function

get_f_stat_pvalue <- function(model) {
  model_glance <- glance(model)
  
  # Create a data frame to return F-statistic and its p-value
  data.frame(
    "F-statistic" = round(model_glance$statistic, 3),
    "F p-value" = round(model_glance$p.value, 3)
  )
}


# pwtest function

run_pwtest <- function(outcomes, data, seed = 10032025) {
  # Set seed for reproducibility
  set.seed(seed)
  
  # Define the covariates
  covariates <- c("number_villages_GP",
                  "km_from_block_office",
                  "total_time_from_block_office",
                  "speed_to_block_office",
                  "pilgrimage_sites",
                  "historic_sites",
                  "economic_opportunities",
                  "all_villages_electricity_grid",
                  "reserved_women",
                  "reserved_obc",
                  "reserved_sc",
                  "total_population_gp",
                  "overall_percent_SC_gp")
  
  # Initialize an empty dataframe to store results
  results <- data.frame(outcome = character(), pwdelta = numeric(), pwdelta_p = numeric(), stringsAsFactors = FALSE)
  
  # Loop through each outcome variable
  for (outcome in outcomes) {
    # Run the pwtest function
    test_result <- pwtest(data, covariates = covariates, treatment = "direct", outcome = outcome)
    
    # Extract pwdelta and pwdelta_p and store in the results dataframe
    results <- rbind(results, data.frame(outcome = outcome, pwdelta = test_result$pwdelta, pwdelta_p = test_result$pwdelta_p))
  }
  
  return(results)
}
